# Fix the RNG seed so the random draws made later in this script
# (e.g. the sample() used for the train/test split) are reproducible:
set.seed(1)
# Preview the first ten rows of the data frame:
head(data, 10)
# Inspect the column names and storage types:
str(data)
## 'data.frame': 2925 obs. of 74 variables:
## $ MS.Zoning : chr "RL" "RH" "RL" "RL" ...
## $ Lot.Frontage : int 141 80 81 93 74 78 41 43 39 60 ...
## $ Lot.Area : int 31770 11622 14267 11160 13830 9978 4920 5005 5389 7500 ...
## $ Street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr "None" "None" "None" "None" ...
## $ Lot.Shape : chr "IR1" "Reg" "IR1" "Reg" ...
## $ Land.Contour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Lot.Config : chr "Corner" "Inside" "Corner" "Corner" ...
## $ Land.Slope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Condition.1 : chr "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition.2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ Bldg.Type : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ House.Style : chr "1Story" "1Story" "1Story" "1Story" ...
## $ Overall.Qual : int 6 5 6 7 5 6 8 8 8 7 ...
## $ Overall.Cond : int 5 6 6 5 5 6 5 5 5 5 ...
## $ Year.Built : int 1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
## $ Year.Remod.Add : int 1960 1961 1958 1968 1998 1998 2001 1992 1996 1999 ...
## $ Roof.Style : chr "Hip" "Gable" "Hip" "Hip" ...
## $ Roof.Matl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior.1st : chr "BrkFace" "VinylSd" "Wood" "BrkFace" ...
## $ Exterior.2nd : chr "Plywood" "VinylSd" "wood" "BrkFace" ...
## $ Mas.Vnr.Type : chr "Stone" "None" "BrkFace" "None" ...
## $ Mas.Vnr.Area : int 112 0 108 0 0 20 0 0 0 0 ...
## $ Exter.Qual : chr "TA" "TA" "TA" "Gd" ...
## $ Exter.Cond : chr "TA" "TA" "TA" "TA" ...
## $ Foundation : chr "CBlock" "CBlock" "CBlock" "CBlock" ...
## $ Bsmt.Qual : chr "TA" "TA" "TA" "TA" ...
## $ Bsmt.Cond : chr "Gd" "TA" "TA" "TA" ...
## $ Bsmt.Exposure : chr "Gd" "No" "No" "No" ...
## $ BsmtFin.Type.1 : chr "BLQ" "Rec" "ALQ" "ALQ" ...
## $ BsmtFin.SF.1 : int 639 468 923 1065 791 602 616 263 1180 0 ...
## $ BsmtFin.Type.2 : chr "Unf" "LwQ" "Unf" "Unf" ...
## $ BsmtFin.SF.2 : int 0 144 0 0 0 0 0 0 0 0 ...
## $ Bsmt.Unf.SF : int 441 270 406 1045 137 324 722 1017 415 994 ...
## $ Total.Bsmt.SF : int 1080 882 1329 2110 928 926 1338 1280 1595 994 ...
## $ Heating : chr "Gas" "Gas" "Gas" "Gas" ...
## $ Heating.QC : chr "Fa" "TA" "TA" "Ex" ...
## $ Central.Air : chr "Y" "Y" "Y" "Y" ...
## $ Electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ X1st.Flr.SF : int 1656 896 1329 2110 928 926 1338 1280 1616 1028 ...
## $ X2nd.Flr.SF : int 0 0 0 0 701 678 0 0 0 776 ...
## $ Low.Qual.Fin.SF: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Gr.Liv.Area : int 1656 896 1329 2110 1629 1604 1338 1280 1616 1804 ...
## $ Bsmt.Full.Bath : int 1 0 0 1 0 0 1 0 1 0 ...
## $ Bsmt.Half.Bath : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Full.Bath : int 1 1 1 2 2 2 2 2 2 2 ...
## $ Half.Bath : int 0 0 1 1 1 1 0 0 0 1 ...
## $ Bedroom.AbvGr : int 3 2 3 3 3 3 2 2 2 3 ...
## $ Kitchen.AbvGr : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Kitchen.Qual : chr "TA" "TA" "Gd" "Ex" ...
## $ TotRms.AbvGrd : int 7 5 6 8 6 7 6 5 5 7 ...
## $ Functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : int 2 0 0 2 1 1 0 0 1 1 ...
## $ Fireplace.Qu : chr "Gd" "None" "None" "TA" ...
## $ Garage.Type : chr "Attchd" "Attchd" "Attchd" "Attchd" ...
## $ Garage.Yr.Blt : int 1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
## $ Garage.Finish : chr "Fin" "Unf" "Unf" "Fin" ...
## $ Garage.Cars : int 2 1 1 2 2 2 2 2 2 2 ...
## $ Garage.Area : int 528 730 312 522 482 470 582 506 608 442 ...
## $ Garage.Qual : chr "TA" "TA" "TA" "TA" ...
## $ Garage.Cond : chr "TA" "TA" "TA" "TA" ...
## $ Paved.Drive : chr "P" "Y" "Y" "Y" ...
## $ Wood.Deck.SF : int 210 140 393 0 212 360 0 0 237 140 ...
## $ Open.Porch.SF : int 62 0 36 0 34 36 0 82 152 60 ...
## $ Enclosed.Porch : int 0 0 0 0 0 0 170 0 0 0 ...
## $ X3Ssn.Porch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Screen.Porch : int 0 120 0 0 0 0 0 144 0 0 ...
## $ Pool.Area : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Fence : chr "None" "MnPrv" "None" "None" ...
## $ Misc.Val : int 0 0 12500 0 0 0 0 0 0 0 ...
## $ Yr.Sold : int 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ Sale.Type : chr "WD " "WD " "WD " "WD " ...
## $ Sale.Condition : chr "Normal" "Normal" "Normal" "Normal" ...
## $ SalePrice : int 215000 105000 172000 244000 189900 195500 213500 191500 236500 189000 ...
# Summarise every column. For numeric columns the "NA's" row gives the count
# of missing values; character columns only report Length/Class/Mode here, so
# any missing values in them are not visible in this output.
# NOTE(review): the recorded output shows Garage.Yr.Blt with a maximum of 2207,
# which is later than the latest Yr.Sold (2010) -- looks like a data-entry
# error; confirm before relying on Garage.Yr.Blt as a predictor.
summary(data)
## MS.Zoning Lot.Frontage Lot.Area Street
## Length:2925 Min. : 21.00 Min. : 1300 Length:2925
## Class :character 1st Qu.: 58.00 1st Qu.: 7438 Class :character
## Mode :character Median : 68.00 Median : 9428 Mode :character
## Mean : 69.07 Mean : 10108
## 3rd Qu.: 80.00 3rd Qu.: 11520
## Max. :313.00 Max. :215245
## NA's :490
## Alley Lot.Shape Land.Contour Lot.Config
## Length:2925 Length:2925 Length:2925 Length:2925
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Land.Slope Condition.1 Condition.2 Bldg.Type
## Length:2925 Length:2925 Length:2925 Length:2925
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## House.Style Overall.Qual Overall.Cond Year.Built
## Length:2925 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.094 Mean :5.565 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2001
## Max. :10.000 Max. :9.000 Max. :2010
##
## Year.Remod.Add Roof.Style Roof.Matl Exterior.1st
## Min. :1950 Length:2925 Length:2925 Length:2925
## 1st Qu.:1965 Class :character Class :character Class :character
## Median :1993 Mode :character Mode :character Mode :character
## Mean :1984
## 3rd Qu.:2004
## Max. :2010
##
## Exterior.2nd Mas.Vnr.Type Mas.Vnr.Area Exter.Qual
## Length:2925 Length:2925 Min. : 0.0 Length:2925
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 101.1
## 3rd Qu.: 164.0
## Max. :1600.0
## NA's :23
## Exter.Cond Foundation Bsmt.Qual Bsmt.Cond
## Length:2925 Length:2925 Length:2925 Length:2925
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Bsmt.Exposure BsmtFin.Type.1 BsmtFin.SF.1 BsmtFin.Type.2
## Length:2925 Length:2925 Min. : 0.0 Length:2925
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 370.0 Mode :character
## Mean : 439.3
## 3rd Qu.: 734.0
## Max. :2288.0
## NA's :1
## BsmtFin.SF.2 Bsmt.Unf.SF Total.Bsmt.SF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0 Length:2925
## 1st Qu.: 0.00 1st Qu.: 219.0 1st Qu.: 793 Class :character
## Median : 0.00 Median : 464.5 Median : 990 Mode :character
## Mean : 49.81 Mean : 559.2 Mean :1048
## 3rd Qu.: 0.00 3rd Qu.: 801.2 3rd Qu.:1300
## Max. :1526.00 Max. :2336.0 Max. :3206
## NA's :1 NA's :1 NA's :1
## Heating.QC Central.Air Electrical X1st.Flr.SF
## Length:2925 Length:2925 Length:2925 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 877
## Mode :character Mode :character Mode :character Median :1084
## Mean :1157
## 3rd Qu.:1383
## Max. :3820
##
## X2nd.Flr.SF Low.Qual.Fin.SF Gr.Liv.Area Bsmt.Full.Bath
## Min. : 0.0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0.0 1st Qu.: 0.000 1st Qu.:1126 1st Qu.:0.0000
## Median : 0.0 Median : 0.000 Median :1442 Median :0.0000
## Mean : 335.2 Mean : 4.685 Mean :1496 Mean :0.4307
## 3rd Qu.: 703.0 3rd Qu.: 0.000 3rd Qu.:1742 3rd Qu.:1.0000
## Max. :2065.0 Max. :1064.000 Max. :4476 Max. :3.0000
## NA's :2
## Bsmt.Half.Bath Full.Bath Half.Bath Bedroom.AbvGr
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.0000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.0609 Mean :1.566 Mean :0.3791 Mean :2.855
## 3rd Qu.:0.0000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.0000 Max. :4.000 Max. :2.0000 Max. :8.000
## NA's :2
## Kitchen.AbvGr Kitchen.Qual TotRms.AbvGrd Functional
## Min. :0.000 Length:2925 Min. : 2.000 Length:2925
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.044 Mean : 6.438
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## Fireplaces Fireplace.Qu Garage.Type Garage.Yr.Blt
## Min. :0.0000 Length:2925 Length:2925 Min. :1895
## 1st Qu.:0.0000 Class :character Class :character 1st Qu.:1960
## Median :1.0000 Mode :character Mode :character Median :1979
## Mean :0.5979 Mean :1978
## 3rd Qu.:1.0000 3rd Qu.:2002
## Max. :4.0000 Max. :2207
## NA's :159
## Garage.Finish Garage.Cars Garage.Area Garage.Qual
## Length:2925 Min. :0.000 Min. : 0 Length:2925
## Class :character 1st Qu.:1.000 1st Qu.: 320 Class :character
## Mode :character Median :2.000 Median : 480 Mode :character
## Mean :1.766 Mean : 472
## 3rd Qu.:2.000 3rd Qu.: 576
## Max. :5.000 Max. :1488
## NA's :1 NA's :1
## Garage.Cond Paved.Drive Wood.Deck.SF Open.Porch.SF
## Length:2925 Length:2925 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 27.00
## Mean : 93.58 Mean : 47.21
## 3rd Qu.: 168.00 3rd Qu.: 70.00
## Max. :1424.00 Max. :742.00
##
## Enclosed.Porch X3Ssn.Porch Screen.Porch Pool.Area
## Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.000 Median : 0.00 Median : 0.000
## Mean : 23.05 Mean : 2.597 Mean : 16.03 Mean : 2.083
## 3rd Qu.: 0.00 3rd Qu.: 0.000 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :1012.00 Max. :508.000 Max. :576.00 Max. :800.000
##
## Fence Misc.Val Yr.Sold Sale.Type
## Length:2925 Min. : 0.00 Min. :2006 Length:2925
## Class :character 1st Qu.: 0.00 1st Qu.:2007 Class :character
## Mode :character Median : 0.00 Median :2008 Mode :character
## Mean : 44.91 Mean :2008
## 3rd Qu.: 0.00 3rd Qu.:2009
## Max. :15500.00 Max. :2010
##
## Sale.Condition SalePrice
## Length:2925 Min. : 34900
## Class :character 1st Qu.:129500
## Mode :character Median :160000
## Mean :180916
## 3rd Qu.:213500
## Max. :755000
##
# Listwise deletion: drop every row that contains at least one NA.
data2 <- na.omit(data)
# Confirm the remaining row count and the column types:
str(data2)
## 'data.frame': 2258 obs. of 74 variables:
## $ MS.Zoning : chr "RL" "RH" "RL" "RL" ...
## $ Lot.Frontage : int 141 80 81 93 74 78 41 43 39 60 ...
## $ Lot.Area : int 31770 11622 14267 11160 13830 9978 4920 5005 5389 7500 ...
## $ Street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr "None" "None" "None" "None" ...
## $ Lot.Shape : chr "IR1" "Reg" "IR1" "Reg" ...
## $ Land.Contour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Lot.Config : chr "Corner" "Inside" "Corner" "Corner" ...
## $ Land.Slope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Condition.1 : chr "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition.2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ Bldg.Type : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ House.Style : chr "1Story" "1Story" "1Story" "1Story" ...
## $ Overall.Qual : int 6 5 6 7 5 6 8 8 8 7 ...
## $ Overall.Cond : int 5 6 6 5 5 6 5 5 5 5 ...
## $ Year.Built : int 1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
## $ Year.Remod.Add : int 1960 1961 1958 1968 1998 1998 2001 1992 1996 1999 ...
## $ Roof.Style : chr "Hip" "Gable" "Hip" "Hip" ...
## $ Roof.Matl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior.1st : chr "BrkFace" "VinylSd" "Wood" "BrkFace" ...
## $ Exterior.2nd : chr "Plywood" "VinylSd" "wood" "BrkFace" ...
## $ Mas.Vnr.Type : chr "Stone" "None" "BrkFace" "None" ...
## $ Mas.Vnr.Area : int 112 0 108 0 0 20 0 0 0 0 ...
## $ Exter.Qual : chr "TA" "TA" "TA" "Gd" ...
## $ Exter.Cond : chr "TA" "TA" "TA" "TA" ...
## $ Foundation : chr "CBlock" "CBlock" "CBlock" "CBlock" ...
## $ Bsmt.Qual : chr "TA" "TA" "TA" "TA" ...
## $ Bsmt.Cond : chr "Gd" "TA" "TA" "TA" ...
## $ Bsmt.Exposure : chr "Gd" "No" "No" "No" ...
## $ BsmtFin.Type.1 : chr "BLQ" "Rec" "ALQ" "ALQ" ...
## $ BsmtFin.SF.1 : int 639 468 923 1065 791 602 616 263 1180 0 ...
## $ BsmtFin.Type.2 : chr "Unf" "LwQ" "Unf" "Unf" ...
## $ BsmtFin.SF.2 : int 0 144 0 0 0 0 0 0 0 0 ...
## $ Bsmt.Unf.SF : int 441 270 406 1045 137 324 722 1017 415 994 ...
## $ Total.Bsmt.SF : int 1080 882 1329 2110 928 926 1338 1280 1595 994 ...
## $ Heating : chr "Gas" "Gas" "Gas" "Gas" ...
## $ Heating.QC : chr "Fa" "TA" "TA" "Ex" ...
## $ Central.Air : chr "Y" "Y" "Y" "Y" ...
## $ Electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ X1st.Flr.SF : int 1656 896 1329 2110 928 926 1338 1280 1616 1028 ...
## $ X2nd.Flr.SF : int 0 0 0 0 701 678 0 0 0 776 ...
## $ Low.Qual.Fin.SF: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Gr.Liv.Area : int 1656 896 1329 2110 1629 1604 1338 1280 1616 1804 ...
## $ Bsmt.Full.Bath : int 1 0 0 1 0 0 1 0 1 0 ...
## $ Bsmt.Half.Bath : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Full.Bath : int 1 1 1 2 2 2 2 2 2 2 ...
## $ Half.Bath : int 0 0 1 1 1 1 0 0 0 1 ...
## $ Bedroom.AbvGr : int 3 2 3 3 3 3 2 2 2 3 ...
## $ Kitchen.AbvGr : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Kitchen.Qual : chr "TA" "TA" "Gd" "Ex" ...
## $ TotRms.AbvGrd : int 7 5 6 8 6 7 6 5 5 7 ...
## $ Functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : int 2 0 0 2 1 1 0 0 1 1 ...
## $ Fireplace.Qu : chr "Gd" "None" "None" "TA" ...
## $ Garage.Type : chr "Attchd" "Attchd" "Attchd" "Attchd" ...
## $ Garage.Yr.Blt : int 1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
## $ Garage.Finish : chr "Fin" "Unf" "Unf" "Fin" ...
## $ Garage.Cars : int 2 1 1 2 2 2 2 2 2 2 ...
## $ Garage.Area : int 528 730 312 522 482 470 582 506 608 442 ...
## $ Garage.Qual : chr "TA" "TA" "TA" "TA" ...
## $ Garage.Cond : chr "TA" "TA" "TA" "TA" ...
## $ Paved.Drive : chr "P" "Y" "Y" "Y" ...
## $ Wood.Deck.SF : int 210 140 393 0 212 360 0 0 237 140 ...
## $ Open.Porch.SF : int 62 0 36 0 34 36 0 82 152 60 ...
## $ Enclosed.Porch : int 0 0 0 0 0 0 170 0 0 0 ...
## $ X3Ssn.Porch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Screen.Porch : int 0 120 0 0 0 0 0 144 0 0 ...
## $ Pool.Area : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Fence : chr "None" "MnPrv" "None" "None" ...
## $ Misc.Val : int 0 0 12500 0 0 0 0 0 0 0 ...
## $ Yr.Sold : int 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ Sale.Type : chr "WD " "WD " "WD " "WD " ...
## $ Sale.Condition : chr "Normal" "Normal" "Normal" "Normal" ...
## $ SalePrice : int 215000 105000 172000 244000 189900 195500 213500 191500 236500 189000 ...
## - attr(*, "na.action")= 'omit' Named int [1:667] 12 15 23 24 25 28 56 58 59 67 ...
## ..- attr(*, "names")= chr [1:667] "12" "15" "23" "24" ...
# Split row positions of data2 into training and test index vectors.
# 1800 training rows out of the 2258 complete cases is roughly an 80/20 split
# (not 70/30). The training size is kept at 1800 so results match the recorded
# output; the total row count is read from the data instead of being typed in.
n.obs <- nrow(data2)
train <- sample(n.obs, 1800)
# Every row position not drawn for training, in ascending order:
test <- setdiff(seq_len(n.obs), train)
# Build a data frame holding only the numeric (integer) columns of data2.
# Columns are selected by type rather than by hard-coded position so the
# selection stays correct if columns are added, removed or reordered upstream.
# For the 74-column layout shown by str(data2) this picks exactly the columns
# previously listed by index (2, 3, 14:17, 23, 31, 33:35, 40:49, 51, 53, 56,
# 58, 59, 63:68, 70, 71, 74), with SalePrice as the last column.
numeric.cols <- vapply(data2, is.numeric, logical(1))
num.ames <- data2[, numeric.cols, drop = FALSE]
# Visual check of each predictor's relationship with SalePrice:
# one scatter plot of SalePrice against every other numeric column,
# drawn from the training rows only.
plot(SalePrice ~ ., data = num.ames[train, ])
# Modeling:
# Model 1: linear regression of SalePrice on eleven numeric predictors,
# fitted on the training rows only.
fit <- lm(
  SalePrice ~ Overall.Qual +
    Year.Built +
    Year.Remod.Add +
    BsmtFin.SF.1 +
    Total.Bsmt.SF +
    X1st.Flr.SF +
    Gr.Liv.Area +
    TotRms.AbvGrd +
    Garage.Yr.Blt +
    Wood.Deck.SF +
    Open.Porch.SF,
  data = num.ames,
  subset = train
)
# Coefficient table and fit statistics for model 1:
summary(fit)
##
## Call:
## lm(formula = SalePrice ~ Overall.Qual + Year.Built + Year.Remod.Add +
## BsmtFin.SF.1 + Total.Bsmt.SF + X1st.Flr.SF + Gr.Liv.Area +
## TotRms.AbvGrd + Garage.Yr.Blt + Wood.Deck.SF + Open.Porch.SF,
## data = num.ames, subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -136957 -18417 -1373 16771 230613
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.119e+06 9.574e+04 -11.689 < 2e-16 ***
## Overall.Qual 1.966e+04 9.020e+02 21.791 < 2e-16 ***
## Year.Built 1.537e+02 4.895e+01 3.141 0.001714 **
## Year.Remod.Add 2.550e+02 5.246e+01 4.860 1.27e-06 ***
## BsmtFin.SF.1 3.251e+01 2.062e+00 15.767 < 2e-16 ***
## Total.Bsmt.SF 2.694e+01 3.472e+00 7.759 1.43e-14 ***
## X1st.Flr.SF 1.274e+01 3.754e+00 3.393 0.000706 ***
## Gr.Liv.Area 6.844e+01 3.282e+00 20.853 < 2e-16 ***
## TotRms.AbvGrd -1.002e+03 8.885e+02 -1.128 0.259577
## Garage.Yr.Blt 1.097e+02 5.597e+01 1.960 0.050202 .
## Wood.Deck.SF 1.843e+01 6.779e+00 2.718 0.006621 **
## Open.Porch.SF 1.395e+01 1.291e+01 1.080 0.280151
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32820 on 1788 degrees of freedom
## Multiple R-squared: 0.8528, Adjusted R-squared: 0.8519
## F-statistic: 941.6 on 11 and 1788 DF, p-value: < 2.2e-16
# Model 2: same as model 1 with Open.Porch.SF removed
# (its p-value in model 1 was 0.28).
fit2 <- lm(
  SalePrice ~ Overall.Qual +
    Year.Built +
    Year.Remod.Add +
    BsmtFin.SF.1 +
    Total.Bsmt.SF +
    X1st.Flr.SF +
    Gr.Liv.Area +
    TotRms.AbvGrd +
    Garage.Yr.Blt +
    Wood.Deck.SF,
  data = num.ames,
  subset = train
)
# Coefficient table and fit statistics for model 2:
summary(fit2)
##
## Call:
## lm(formula = SalePrice ~ Overall.Qual + Year.Built + Year.Remod.Add +
## BsmtFin.SF.1 + Total.Bsmt.SF + X1st.Flr.SF + Gr.Liv.Area +
## TotRms.AbvGrd + Garage.Yr.Blt + Wood.Deck.SF, data = num.ames,
## subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -137514 -18485 -1627 16894 231403
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.136e+06 9.446e+04 -12.026 < 2e-16 ***
## Overall.Qual 1.966e+04 9.020e+02 21.796 < 2e-16 ***
## Year.Built 1.542e+02 4.895e+01 3.151 0.001656 **
## Year.Remod.Add 2.605e+02 5.221e+01 4.991 6.60e-07 ***
## BsmtFin.SF.1 3.251e+01 2.062e+00 15.765 < 2e-16 ***
## Total.Bsmt.SF 2.723e+01 3.462e+00 7.866 6.26e-15 ***
## X1st.Flr.SF 1.256e+01 3.750e+00 3.349 0.000827 ***
## Gr.Liv.Area 6.900e+01 3.241e+00 21.292 < 2e-16 ***
## TotRms.AbvGrd -1.034e+03 8.881e+02 -1.164 0.244550
## Garage.Yr.Blt 1.121e+02 5.593e+01 2.004 0.045181 *
## Wood.Deck.SF 1.775e+01 6.750e+00 2.630 0.008619 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32820 on 1789 degrees of freedom
## Multiple R-squared: 0.8527, Adjusted R-squared: 0.8519
## F-statistic: 1036 on 10 and 1789 DF, p-value: < 2.2e-16
# Model 3: model 2 with TotRms.AbvGrd removed as well
# (its p-value in model 2 was 0.24).
fit3 <- lm(
  SalePrice ~ Overall.Qual +
    Year.Built +
    Year.Remod.Add +
    BsmtFin.SF.1 +
    Total.Bsmt.SF +
    X1st.Flr.SF +
    Gr.Liv.Area +
    Garage.Yr.Blt +
    Wood.Deck.SF,
  data = num.ames,
  subset = train
)
# Coefficient table and fit statistics for model 3:
summary(fit3)
##
## Call:
## lm(formula = SalePrice ~ Overall.Qual + Year.Built + Year.Remod.Add +
## BsmtFin.SF.1 + Total.Bsmt.SF + X1st.Flr.SF + Gr.Liv.Area +
## Garage.Yr.Blt + Wood.Deck.SF, data = num.ames, subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -137509 -18564 -1665 17169 229284
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.140e+06 9.442e+04 -12.070 < 2e-16 ***
## Overall.Qual 1.974e+04 8.993e+02 21.955 < 2e-16 ***
## Year.Built 1.534e+02 4.895e+01 3.134 0.001749 **
## Year.Remod.Add 2.620e+02 5.220e+01 5.020 5.67e-07 ***
## BsmtFin.SF.1 3.285e+01 2.041e+00 16.091 < 2e-16 ***
## Total.Bsmt.SF 2.725e+01 3.462e+00 7.872 5.99e-15 ***
## X1st.Flr.SF 1.258e+01 3.751e+00 3.353 0.000816 ***
## Gr.Liv.Area 6.619e+01 2.160e+00 30.642 < 2e-16 ***
## Garage.Yr.Blt 1.116e+02 5.593e+01 1.995 0.046140 *
## Wood.Deck.SF 1.787e+01 6.750e+00 2.648 0.008169 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32830 on 1790 degrees of freedom
## Multiple R-squared: 0.8526, Adjusted R-squared: 0.8518
## F-statistic: 1150 on 9 and 1790 DF, p-value: < 2.2e-16
# Residual diagnostics for model 3 (fit3).
# Note: `fit3$res` and `fit3$fitted` rely on `$` partial matching to reach the
# `residuals` and `fitted.values` components of the lm object. The expressions
# are left as written because the plot axis labels and the data name printed by
# shapiro.test() are derived from them.
# Residuals (y-axis) against fitted values (x-axis), to look for non-constant
# variance or curvature:
plot(fit3$res~fit3$fitted, main = "Fitted vs Residuals")
# Histogram of the residuals as a visual normality check
# (the three colours are recycled across the bars):
hist(fit3$res, main = "Normality Test",
col = c("blue", "red", "green"))
# Normal Q-Q plot of the residuals:
qqnorm((fit3$res))
# Add the reference line through the quartiles:
qqline(fit3$res)
# Shapiro-Wilk test of the residuals; the recorded result (p < 2.2e-16)
# rejects normality:
shapiro.test(fit3$res)
##
## Shapiro-Wilk normality test
##
## data: fit3$res
## W = 0.93102, p-value < 2.2e-16
# Box-Cox profile likelihood to choose a power transformation of SalePrice.
# The fit is restricted to the training rows so the held-out test rows do not
# influence the choice of transformation. The rows are selected through `data`
# because a `subset` argument cannot be forwarded reliably through boxcox()'s
# `...` to the internal lm() call.
# NOTE(review): this formula still includes TotRms.AbvGrd, which was dropped
# from fit3 -- confirm which predictor set is intended.
MASS::boxcox(
  SalePrice ~ Overall.Qual + Year.Built + Year.Remod.Add + BsmtFin.SF.1 +
    Total.Bsmt.SF + X1st.Flr.SF + Gr.Liv.Area + TotRms.AbvGrd +
    Garage.Yr.Blt + Wood.Deck.SF,
  data = num.ames[train, ]
)
# Log-transformed response: natural log of SalePrice for every row of num.ames.
# Kept as a free-standing vector (same length and order as the rows of
# num.ames / data2) because later models reference it by name.
SalePriceLog <- log(num.ames$SalePrice)
# Model 4: log(SalePrice) regressed on the numeric predictors.
# Fitted on the training rows only (subset = train), like the other models,
# so the test rows stay unseen.
# NOTE(review): this formula still includes TotRms.AbvGrd, which was dropped
# from fit3 -- confirm which predictor set is intended.
fit4 <- lm(
  SalePriceLog ~ Overall.Qual + Year.Built + Year.Remod.Add + BsmtFin.SF.1 +
    Total.Bsmt.SF + X1st.Flr.SF + Gr.Liv.Area + TotRms.AbvGrd +
    Garage.Yr.Blt + Wood.Deck.SF,
  data = num.ames,
  subset = train
)
# Residuals against fitted values for model 4:
plot(fit4$res~fit4$fitted, main = "Diagnostic Check Model 4")
# Choosing categorical predictors:
# Compare a numeric-only baseline against the same model with one categorical
# variable added; the F-test from anova() shows whether the addition helps.
# Baseline model on the training rows (response is the global SalePriceLog):
model1 <- lm(
  SalePriceLog ~ Overall.Qual +
    Year.Built +
    Year.Remod.Add +
    BsmtFin.SF.1 +
    Total.Bsmt.SF +
    X1st.Flr.SF +
    Gr.Liv.Area +
    Wood.Deck.SF +
    Open.Porch.SF,
  data = data2,
  subset = train
)
# Same model with Street appended as the candidate categorical predictor:
model2 <- update(model1, . ~ . + Street)
# Nested-model F-test, baseline vs. baseline + Street:
anova(model1, model2)
# Re-list the columns of data2 for reference when picking further candidates:
str(data2)
## 'data.frame': 2258 obs. of 74 variables:
## $ MS.Zoning : chr "RL" "RH" "RL" "RL" ...
## $ Lot.Frontage : int 141 80 81 93 74 78 41 43 39 60 ...
## $ Lot.Area : int 31770 11622 14267 11160 13830 9978 4920 5005 5389 7500 ...
## $ Street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr "None" "None" "None" "None" ...
## $ Lot.Shape : chr "IR1" "Reg" "IR1" "Reg" ...
## $ Land.Contour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Lot.Config : chr "Corner" "Inside" "Corner" "Corner" ...
## $ Land.Slope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Condition.1 : chr "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition.2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ Bldg.Type : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ House.Style : chr "1Story" "1Story" "1Story" "1Story" ...
## $ Overall.Qual : int 6 5 6 7 5 6 8 8 8 7 ...
## $ Overall.Cond : int 5 6 6 5 5 6 5 5 5 5 ...
## $ Year.Built : int 1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
## $ Year.Remod.Add : int 1960 1961 1958 1968 1998 1998 2001 1992 1996 1999 ...
## $ Roof.Style : chr "Hip" "Gable" "Hip" "Hip" ...
## $ Roof.Matl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior.1st : chr "BrkFace" "VinylSd" "Wood" "BrkFace" ...
## $ Exterior.2nd : chr "Plywood" "VinylSd" "wood" "BrkFace" ...
## $ Mas.Vnr.Type : chr "Stone" "None" "BrkFace" "None" ...
## $ Mas.Vnr.Area : int 112 0 108 0 0 20 0 0 0 0 ...
## $ Exter.Qual : chr "TA" "TA" "TA" "Gd" ...
## $ Exter.Cond : chr "TA" "TA" "TA" "TA" ...
## $ Foundation : chr "CBlock" "CBlock" "CBlock" "CBlock" ...
## $ Bsmt.Qual : chr "TA" "TA" "TA" "TA" ...
## $ Bsmt.Cond : chr "Gd" "TA" "TA" "TA" ...
## $ Bsmt.Exposure : chr "Gd" "No" "No" "No" ...
## $ BsmtFin.Type.1 : chr "BLQ" "Rec" "ALQ" "ALQ" ...
## $ BsmtFin.SF.1 : int 639 468 923 1065 791 602 616 263 1180 0 ...
## $ BsmtFin.Type.2 : chr "Unf" "LwQ" "Unf" "Unf" ...
## $ BsmtFin.SF.2 : int 0 144 0 0 0 0 0 0 0 0 ...
## $ Bsmt.Unf.SF : int 441 270 406 1045 137 324 722 1017 415 994 ...
## $ Total.Bsmt.SF : int 1080 882 1329 2110 928 926 1338 1280 1595 994 ...
## $ Heating : chr "Gas" "Gas" "Gas" "Gas" ...
## $ Heating.QC : chr "Fa" "TA" "TA" "Ex" ...
## $ Central.Air : chr "Y" "Y" "Y" "Y" ...
## $ Electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ X1st.Flr.SF : int 1656 896 1329 2110 928 926 1338 1280 1616 1028 ...
## $ X2nd.Flr.SF : int 0 0 0 0 701 678 0 0 0 776 ...
## $ Low.Qual.Fin.SF: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Gr.Liv.Area : int 1656 896 1329 2110 1629 1604 1338 1280 1616 1804 ...
## $ Bsmt.Full.Bath : int 1 0 0 1 0 0 1 0 1 0 ...
## $ Bsmt.Half.Bath : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Full.Bath : int 1 1 1 2 2 2 2 2 2 2 ...
## $ Half.Bath : int 0 0 1 1 1 1 0 0 0 1 ...
## $ Bedroom.AbvGr : int 3 2 3 3 3 3 2 2 2 3 ...
## $ Kitchen.AbvGr : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Kitchen.Qual : chr "TA" "TA" "Gd" "Ex" ...
## $ TotRms.AbvGrd : int 7 5 6 8 6 7 6 5 5 7 ...
## $ Functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : int 2 0 0 2 1 1 0 0 1 1 ...
## $ Fireplace.Qu : chr "Gd" "None" "None" "TA" ...
## $ Garage.Type : chr "Attchd" "Attchd" "Attchd" "Attchd" ...
## $ Garage.Yr.Blt : int 1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
## $ Garage.Finish : chr "Fin" "Unf" "Unf" "Fin" ...
## $ Garage.Cars : int 2 1 1 2 2 2 2 2 2 2 ...
## $ Garage.Area : int 528 730 312 522 482 470 582 506 608 442 ...
## $ Garage.Qual : chr "TA" "TA" "TA" "TA" ...
## $ Garage.Cond : chr "TA" "TA" "TA" "TA" ...
## $ Paved.Drive : chr "P" "Y" "Y" "Y" ...
## $ Wood.Deck.SF : int 210 140 393 0 212 360 0 0 237 140 ...
## $ Open.Porch.SF : int 62 0 36 0 34 36 0 82 152 60 ...
## $ Enclosed.Porch : int 0 0 0 0 0 0 170 0 0 0 ...
## $ X3Ssn.Porch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Screen.Porch : int 0 120 0 0 0 0 0 144 0 0 ...
## $ Pool.Area : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Fence : chr "None" "MnPrv" "None" "None" ...
## $ Misc.Val : int 0 0 12500 0 0 0 0 0 0 0 ...
## $ Yr.Sold : int 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ Sale.Type : chr "WD " "WD " "WD " "WD " ...
## $ Sale.Condition : chr "Normal" "Normal" "Normal" "Normal" ...
## $ SalePrice : int 215000 105000 172000 244000 189900 195500 213500 191500 236500 189000 ...
## - attr(*, "na.action")= 'omit' Named int [1:667] 12 15 23 24 25 28 56 58 59 67 ...
## ..- attr(*, "names")= chr [1:667] "12" "15" "23" "24" ...